Bosco Bakwatanisa
  • Education
  • Photography
  • CV
  • Study1

Exploring Sociodemographic and Health Associations in the NHANES 2017–March 2020 Dataset: Insights from Multivariable and Comparative Analyses

Author

Bosco Bakwatanisa

Published

2024-11-27

1. Setup and Data Ingest

1.1. Loading R packages

#load required R libraries
library(nhanesA)
library(forcats)
library(boot)
library(ggplot2)
library(ggpubr)
library(kableExtra)
library(janitor)
library(naniar)
library(patchwork)
library(gridExtra)
library(broom.mixed)
library(Epi)
library(vcd)
library(xfun)
library(epitools)
library(tidyverse)
theme_set(theme_light())  
knitr::opts_chunk$set(comment=NA)

2. Cleaning the Data

2.1. Data preparation

I downloaded the Blood Pressure data and Demographics data from 2017-March 2020 NHANES data

# pull in data from P_BPXO from NHANES and save it

 bp_raw <- nhanes('P_BPXO') |> tibble()
demo_raw <- nhanes('P_DEMO') |> tibble()
saveRDS(demo_raw, file = "data/P_DEMO.rds")
saveRDS(bp_raw, file = "data/P_BPXO.rds")

# Now that data are saved, I can just read in the tibble

bp_raw <- readRDS(file = "data/P_BPXO.rds")
demo_raw <- readRDS("data/P_DEMO.rds")
DEMO <- demo_raw%>%
  select(., SEQN,RIAGENDR,RIDAGEYR,DMDEDUC2,RIDRETH3,DMDMARTZ )
BPX <- bp_raw%>%
  select(., SEQN, BPXOSY1,BPXOSY2)
Analysis_1Data <- left_join(DEMO, BPX, by = "SEQN")
BMX_raw <- nhanes("P_BMX")|> tibble()   # Body measurements dataset
saveRDS(BMX_raw, file = "data/P_BMX.rds")
# Merge datasets on SEQN (participant ID)
saveRDS(Analysis_1Data,file = 'data/AnalysisA.rds')
BMX_raw<- readRDS("data/P_BMX.rds")
BMXBMI <- BMX_raw%>%select(SEQN,BMXBMI)
AnalysisB_data <- left_join(Analysis_1Data, BMXBMI, by = "SEQN")
saveRDS(AnalysisB_data, file = "data/AnalysisB_data.rds")
#Analysis C uses same dataset as above
#analysisD data download
SMQ_raw <- nhanes("P_SMQ")
AnalysisB_data <- readRDS('data/AnalysisB_data.rds')
smoke_data <- SMQ_raw%>% select(SEQN,SMQ020)
AnalysisD_data <- left_join(AnalysisB_data, smoke_data, by = "SEQN")
AnalysisData <- AnalysisD_data%>% filter( RIDAGEYR >=21 &  RIDAGEYR<= 79)

saveRDS(AnalysisData, file = 'data/AnalysesData.rds')

2. 1.1. Data cleaning and filtering for Analysis A

Here, I zeroed down to the Systolic Blood Pressure measurements for sessions 1 and 2 as the paired outcome variables for analysis A. This was co-selected with the SEQN columns just to ensure that systolic blood pressure is paired for each individual in the chosen population.

AnalysisData <- readRDS('data/AnalysesData.rds')
 analysisA_data <- AnalysisData%>% select(SEQN,BPXOSY1,BPXOSY2)%>% drop_na()%>%
  filter(BPXOSY1 >0, BPXOSY2 > 0)
saveRDS(analysisA_data, file = 'data/AnalysisA.rds')

2.1.2. Analysis B Data Cleaning, re-coding and filtering

Here I used already downloaded demographics and downloaded examination datasets (body measurement data) and then combined the two data sets for this analysis.

AnalysesData <- readRDS('data/AnalysesData.rds')
# Select relevant variables and filter complete cases
analysisB_data <-AnalysesData %>% 
  select(RIAGENDR, BMXBMI)%>%
  filter(complete.cases( RIAGENDR, BMXBMI))%>%
  mutate(Gender = ifelse(RIAGENDR == "Male", 1,2))%>% 
  rename(BMI = BMXBMI)# Add readable gender labels
analysisB_data$Gender <- as.factor(analysisB_data$Gender)
analysisB_data <- analysisB_data[,-1]
#taking alot at the summary stats of the analysis B data
saveRDS(analysisB_data, "data/analysisB.rds")

2.1.3. Analysis C Data Cleaning, re-coding, and filtering

AnalysesData <- readRDS('data/AnalysesData.rds')

analysisC_data <- AnalysesData%>%
  select(SEQN,RIDRETH3,BMXBMI)%>%rename(Race = RIDRETH3, BMI = BMXBMI)

analysisC_data <- analysisC_data%>% 
  mutate(Race = factor(
    Race,
    levels = c("Mexican American","Other Hispanic","Non-Hispanic White",    "Non-Hispanic Black","Non-Hispanic Asian"),
    labels = c("Mexican","Hispanic", "White","Black", "Asian")))%>%filter(complete.cases(Race,BMI))%>% filter(BMI > 0)
length(unique(analysisC_data$BMI))
[1] 443
saveRDS(analysisC_data, 'data/analysisC.rds')

2.1.4. Analysis D Data cleaning, filtering and re-coding

# Filter and recode SMQ020 for 'Yes' and 'No' only
AnalysesData <- readRDS('data/AnalysesData.rds')
analysisD_data <- AnalysesData %>% rename(Smoking = SMQ020, BMI = BMXBMI)%>%
  select(Smoking,BMI) %>%                 # Select relevant columns
  filter(Smoking %in% c('Yes', 'No'), complete.cases(Smoking,BMI)) %>% filter(BMI > 0)%>%# Keep 'Yes' and 'No'.
  mutate(
    Smoking = ifelse(Smoking =="Yes","Smoker","Non_Smoker"),    # Recode 'Yes' to 'Smoker' and 'No' to 'Non-Smoker'
    Obesity = ifelse(BMI >= 30, "Obese", "Non_Obese") # Define obesity based on BMI
  )%>%
  mutate(Smoking = factor(Smoking, levels = c("Smoker","Non_Smoker")),
         Obesity = factor(Obesity, levels = c("Obese","Non_Obese")))
  
saveRDS(analysisD_data, 'data/analysisD.rds')

2.1.5 Analysis E Data cleaning, filtering and re-coding

AnalysesData <- readRDS('data/AnalysesData.rds')
analysisE_data <- AnalysesData %>%
  select(SEQN,RIDRETH3,DMDEDUC2) %>%
  filter( RIDRETH3 %in% c('Mexican American', 'Other Hispanic','Non-Hispanic White','Non-Hispanic Black','Non-Hispanic Asian'),DMDEDUC2 %in% c('Less than 9th grade','9-11th grade (Includes 12th grade with no diploma)','High school graduate/GED or equivalent','Some college or AA degree','College graduate or above'), complete.cases(RIDRETH3,DMDEDUC2))%>%
  mutate(Race = factor(
    RIDRETH3,
    levels = c("Mexican American","Other Hispanic","Non-Hispanic White",    "Non-Hispanic Black","Non-Hispanic Asian"),
    labels = c("Mexican","Hispanic", "White","Black", "Asian")),
    Education = factor(
      DMDEDUC2,
      levels = c("Less than 9th grade","9-11th grade (Includes 12th grade with no diploma)", "High school graduate/GED or equivalent", "Some college or AA degree","College graduate or above"),
      labels = c( "less_9th_grade","9_12th_grade", "High_school","Some_College", "College_graduate" )
    ))%>% select(Race, Education)
saveRDS(analysisE_data, 'data/analysisE.rds')

3. Codebook and Data Description

variable_description <- tibble::tribble(
  ~Variable_Name, ~Role_in_Analysis,~Type, ~Original_Code, ~Definition, ~Cutpoints_or_Units, ~Year_Measured,
  "SEQN",  "ID",   "numeric",   " Respondent sequence number", "Unique Identifier to identify any persons involved in the study" ,  "N/A",      "2017-March 2020",
  "Gender",  "Analysis B Variable", "Categorical", "RIAGENDR","Gender of the participant","Persons (Male or Female)",      "2017-March 2020",
  "RIDAGEYR", "Used to select Adult persons aged 21-79 years","numeric","RIDAGEYR", "Participant's Age in years at the time of screening ",   "21 -79 years","2017-March 2020",   "Education",   "Analysis E variable",  "Categorical" ,  "DMDEDUC2",    "This variable is the highest grade or level of education completed by adults 20 years and older. 
The response categories are: less than 9th grade education, 9-11th grade education (includes 12th grade and no diploma), High school graduate/GED, some college or associates (AA) degree, and college graduate or higher.",  "Education level - Adults 20+, from less than 9th-grade to College graduate", "2017-March 2020",
  "Race/Ethnicity", "Analysis E variable",  "Categorical", "RIDRETH3 ", "RIDRETH3: This is the race-ethnicity variable included in the demographics file since the 2011-2012 survey cycle to accommodate the oversample of Asian Americans. It was derived from responses to the survey questions on race and Hispanic origin.
Respondents who self-identified as “Mexican American” were coded as such (i.e., RIDRETH3=1) regardless of their other race-ethnicity identities. Otherwise, self-identified “Hispanic” ethnicity would result in code “2, Other Hispanic” in the RIDRETH3 variable.
All other non-Hispanic participants would then be categorized based on their self-reported races: non-Hispanic white (RIDRETH3=3), non-Hispanic black (RIDRETH3=4), non-Hispanic Asian (RIDRETH3=6)", "Number of participants who identified as Race/Hispanic origin w/ NH Asian", "2017-March 2020",
  "BPXOSY1 ",   "Analysis A-variable","numeric", "BPXOSY1", "Systolic - 1st oscillometric reading",   "(mm Hg)", "2017-March 2020",
  "BPXOSY2","Analysis A-variable",  "numeric", "BPXOSY2", "Systolic - 2nd oscillometric reading","(mm Hg)", "2017-March 2020",
  "BMI","Analysis variable", "numeric", "BMXBMI", "Body Mass Index (BMI) was calculated as weight in kilograms divided by height in meters squared, and then rounded to one decimal place.",  "(kg/m**2)", "2017-March 2020",
  "Smoking", "Analysis D-variable",  "Categorical(yes, no)", "SMQ020", "Smoked at least 100 cigarettes in life, (yes for smoked, no never smoked","Number of participants interviewed", "2017-March 2020")
variable_description |> kbl() |> kable_styling()
Variable_Name Role_in_Analysis Type Original_Code Definition Cutpoints_or_Units Year_Measured
SEQN ID numeric Respondent sequence number Unique Identifier to identify any persons involved in the study N/A 2017-March 2020
Gender Analysis B Variable Categorical RIAGENDR Gender of the participant Persons (Male or Female) 2017-March 2020
RIDAGEYR Used to select Adult persons aged 21-79 years numeric RIDAGEYR Participant's Age in years at the time of screening 21 -79 years 2017-March 2020
Education Analysis E variable Categorical DMDEDUC2 This variable is the highest grade or level of education completed by adults 20 years and older. The response categories are: less than 9th grade education, 9-11th grade education (includes 12th grade and no diploma), High school graduate/GED, some college or associates (AA) degree, and college graduate or higher. Education level - Adults 20+, from less than 9th-grade to College graduate 2017-March 2020
Race/Ethnicity Analysis E variable Categorical RIDRETH3 RIDRETH3: This is the race-ethnicity variable included in the demographics file since the 2011-2012 survey cycle to accommodate the oversample of Asian Americans. It was derived from responses to the survey questions on race and Hispanic origin. Respondents who self-identified as “Mexican American” were coded as such (i.e., RIDRETH3=1) regardless of their other race-ethnicity identities. Otherwise, self-identified “Hispanic” ethnicity would result in code “2, Other Hispanic” in the RIDRETH3 variable. All other non-Hispanic participants would then be categorized based on their self-reported races: non-Hispanic white (RIDRETH3=3), non-Hispanic black (RIDRETH3=4), non-Hispanic Asian (RIDRETH3=6) Number of participants who identified as Race/Hispanic origin w/ NH Asian 2017-March 2020
BPXOSY1 Analysis A-variable numeric BPXOSY1 Systolic - 1st oscillometric reading (mm Hg) 2017-March 2020
BPXOSY2 Analysis A-variable numeric BPXOSY2 Systolic - 2nd oscillometric reading (mm Hg) 2017-March 2020
BMI Analysis variable numeric BMXBMI Body Mass Index (BMI) was calculated as weight in kilograms divided by height in meters squared, and then rounded to one decimal place. (kg/m**2) 2017-March 2020
Smoking Analysis D-variable Categorical(yes, no) SMQ020 Smoked at least 100 cigarettes in life, (yes for smoked, no never smoked Number of participants interviewed 2017-March 2020

4. Analysis A

4.1. Research Question

4.1.1.Problem statement

Research has shown that systolic blood pressure (SBP) is a critical measure for diagnosing hypertension and assessing cardiovascular risk. Repeated measurements are usually taken to improve accuracy and account for variability caused by circumstances such as white coat hypertension (temporary rise in blood pressure due to clinical environment), measurement error or technical variability and physiological changes during the measurement process. And therefore understanding the consistency or difference between successive SBP measurements can inform the protocol used for measuring blood pressure by highlighting any systematic bias in measurements and also help in identifying persons who at a risk of developing hypertension.

4.1.2. Research Question:

Is there a significant difference between the first and second oscillometric systolic blood pressure measurements (BPXOSY1 and BPXOSY2) in individuals as recorded in the NHANES 2017–March 2020 dataset?.

4.1.3. Hypotheses

4.1.3.1 Null Hypothesis \(({H_o})\)

There is no significant difference between the mean of the first systolic blood pressure measurement (BPXOSY1) and the second systolic blood pressure measurement (BPXOSY2)

\[{H_o} : {μ_{BPXOSY1} = μ_{BPXOSY2}}\]

4.1.3.2. Alternative Hypothesis

There is a significant difference between the mean of the first systolic blood pressure measurement (BPXOSY1) and the second systolic blood pressure measurement (BPXOSY2).

\[{H_A} : {μ_{BPXOSY1} \neq μ_{BPXOSY2}}\]

4.2. Describing The Data

Refer to the codebook in section 3 for the description of the variables (BPXOSY1 and BPXOSY2) used in this analysis. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries (mean, median, sd, interquartile range, minimum, and maximum) and continued to provide some visualization graphics including histogram, boxplots, and Q-Q plots to provide a better understanding of the data.

analysisA_data <- readRDS('data/AnalysisA.rds')# read the
summary(analysisA_data %>% select(BPXOSY1,BPXOSY2))
    BPXOSY1         BPXOSY2     
 Min.   : 66.0   Min.   : 69.0  
 1st Qu.:111.0   1st Qu.:111.0  
 Median :121.0   Median :121.0  
 Mean   :124.3   Mean   :124.1  
 3rd Qu.:135.0   3rd Qu.:135.0  
 Max.   :225.0   Max.   :222.0  

4.2.1. Visualization

In order to have a clear understanding of the data variables in terms data distribution and checking for normality, I used the Histogram, Boxplot, and the Q-Q plot to display the data.

4.2.1.1. Histogram and Boxplot

From the visualizations above, the data

par(mfrow =c(2,2))
ggplot(analysisA_data, aes(x = log(BPXOSY1))) +
  geom_histogram(fill = "slateblue", col = "white",
                 bins = 20) + 
  labs(title = "LogTransformed-Systolic BP First Reading in `NHANES`",
       x = "log(Systolic BP First Reading (mm Hg))")+
   theme_classic(base_size = 18) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggplot(analysisA_data, aes(x = log(BPXOSY2))) +
  geom_histogram(fill = "orange", col = "white",
                 bins = 20) + 
  labs(title = "Systolic BP Second Reading in `NHANES`",
       x = "log(Systolic BP Second Reading(mm Hg))")+
   theme_classic(base_size = 18) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggplot(analysisA_data, aes(x = "", y = log(BPXOSY1))) +
  geom_boxplot(fill = "slateblue", outlier.size = 2,
               outlier.color = "slateblue") +
  #coord_flip() + 
  labs(y = "log(Systolic BP First Reading (mm Hg))")+
   theme_classic(base_size = 14) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggplot(analysisA_data, aes(x = "", y = log(BPXOSY2)))+
  geom_boxplot(fill = "orange", outlier.size = 2,
               outlier.color = "orange") +
  #coord_flip() + 
  labs(y = "log(Systolic BP Second Reading(mm Hg))")+
   theme_classic(base_size = 14) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggsave("LogTransHistogram_and_Boxplots.png")
4.2.1.2. Q-Q Plot
par(mfrow = c(1, 2))
ggplot(analysisA_data, aes(sample = log(BPXOSY1))) +
  geom_qq(col = "slateblue") + 
  geom_qq_line(col = "magenta") + 
  theme(aspect.ratio = 1) + 
  labs(title = "Normal Q-Q: NHANES log(BPXOSY1)")+
   theme_classic(base_size = 18) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggplot(analysisA_data, aes(sample = log(BPXOSY2))) +
  geom_qq(col = "orange") + 
  geom_qq_line(col = "black") + 
  theme(aspect.ratio = 1) + 
  labs(title = "Normal Q-Q: NHANES log(BPXOSY2)")+ theme_classic(base_size = 18) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggsave("Q-QPlots_for_systolic_blood_pressure.png")

From the Q-Q plot we see that our data is still right skewed even after log-transformation and therefore does not satisfy the normality requirements and hence I will apply non-parametric methods to compare the paired means

4.2.1.3 Scatter Plot Paired Sample Box plots
# Scatter plot of paired data
ggplot(analysisA_data, aes(x = log(BPXOSY1), y = log(BPXOSY2))) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Systolic Blood Pressure (First Reading vs Second Reading)",
       x = "BPXOSY1 (First Reading)", y = "BPXOSY2 (Second Reading)") +
  theme_classic(base_size = 18) +
  theme(
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

# Paired boxplot
analysisA_data %>% mutate(log_sbp1 = log(BPXOSY1), log_sbp2 = log(BPXOSY2))%>%
  pivot_longer(cols = c(log_sbp1, log_sbp2), names_to = "ReadingSession", values_to = "SystolicBloodPressure") %>%
  ggplot(aes(x = ReadingSession, y = SystolicBloodPressure, fill = ReadingSession)) +
  geom_boxplot() +
  labs(title = "Distribution of Blood Pressure (Paired Sessions)",
       x = "Reading-Session", y = "Blood Pressure (mmHg)") +
   theme_classic(base_size = 18) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggsave("scatter plot_bp_sessions.png")
ggsave("Boxplot_Blood_Pressure_Sessions.png")
write.csv(analysisA_data, "data/Paired_Systolic_BP_data.csv")

4.3. Main Analysis

Here, I used Wilcoxon signed-rank sum test to compare the means of the systolic blood pressure between the two sessions since the data did not meet the normality requirements. Additionally, I used a bootstrap procedure to calculate a confidence interval for the mean difference.

4.3.1. Wilcoxon Signed-Rank Test

# Perform Wilcoxon signed-rank test
wilcox_result <- wilcox.test(log(analysisA_data$BPXOSY1), log(analysisA_data$BPXOSY2), paired = TRUE)
print(wilcox_result)

    Wilcoxon signed rank test with continuity correction

data:  log(analysisA_data$BPXOSY1) and log(analysisA_data$BPXOSY2)
V = 10992502, p-value = 0.009045
alternative hypothesis: true location shift is not equal to 0

4.3.2. Bootstrap Method

# Define bootstrap function
bootstrap_diff <- function(data, indices) {
  sample_data <- data[indices, ]
  mean(log(sample_data$BPXOSY1) - log(sample_data$BPXOSY2))
}
# Run bootstrap
set.seed(123)
boot_result <- boot(data = analysisA_data, statistic = bootstrap_diff, R = 1000)

# Confidence interval
boot_ci <- boot.ci(boot_result, type = "perc")
print(boot_ci)
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates

CALL : 
boot.ci(boot.out = boot_result, type = "perc")

Intervals : 
Level     Percentile     
95%   ( 0.0004,  0.0026 )  
Calculations and Intervals on Original Scale

4.4. Conclusions

From the results of the Wilcoxon signed rank, We observe that The p-value is very small (0.009), which is far below the common significance level (α=0.05), which implies that the true median difference between Systolic blood pressure values for the 2 oscillometric readings is not equal to zero. And hence there is a statistically significant difference in the paired samples (BPXOSY1 and BPXOSY2). The bootstrap analysis further confirms that there is a significant difference between the paired samples. The mean difference is estimated to be between 0.0004 and 0.0026 with 95% confidence, which is above zero. This supports the conclusion that the two paired systolic blood pressure are statistically different.

5. Analysis B

5.1. Question

5.1.1. Problem Statement

Body Mass Index (BMI) is a widely used indicator for assessing body fat based on height and weight. Gender differences in BMI may arise due to biological, hormonal, and societal factors influencing body composition, metabolism, lifestyle behaviors among others. Understanding how BMI varies with gender within a population may provide us with insights on how this association may influence the gender based health outcomes within the United States population.

5.1.2. Research Question: Is there any association between Body Mass Index and Gender?

5.1.3. Hypotheses

5.1.3.1. Null Hypothesis \(({H_o})\)

There is no association between Body Mass Index (BMI) and Gender in the United States population. The mean BMI of males is equal to the mean BMI for females; \[{H_o} : {μ_{MaleBMI} = μ_{FemaleBMI}}\]

5.1.3.2. Alternative Hypothesis \(({H_A})\):

There is an association between BMI and Gender in the United States population. The mean BMI for males is not equal to the mean BMI for female; \[{H_A} : {μ_{MaleBMI} \neq μ_{FemaleBMI}}\]

5.2. Describing The Data

Refer to the codebook in section 3 for the description of the variables (BMI) and Gender) used in this analysis. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries (mean, median, sd, interquartile range, minimum, and maximum) and continued to provide some visualization graphics including histogram, boxplots, and Q-Q plots to provide a better understanding of the data.

5.2.1. Data summaries for the chosen variables for Analysis B

analysisB_data <- readRDS("data/analysisB.rds")
analysisB_data %>%
  group_by(Gender) %>%
  summarise(
    Mean_BMI = mean(BMI),
    SD_BMI = sd(BMI),
    IQR_BMI = IQR(BMI),
    n = n()
  )%>% kable(digits = 3)%>% kable_styling()
Gender Mean_BMI SD_BMI IQR_BMI n
1 29.571 6.631 7.8 3732
2 30.858 8.493 10.3 4004

5.2.2. Visualization of the data distribution

5.6.1 Boxplot, Histogram, Density plot, and Q-Q Plot

Here, I use a histogram, Q-Q plot, Boxplot, and the density plot to visualize the data distribution and assessment for normality.

par(mfrow = c(2, 2))
ggplot(analysisB_data, aes(x = BMI)) +
  geom_histogram(fill = "slateblue", col = "white",
                 bins = 20) + 
  labs(title = "Body Mass Index Distribution in `NHANES`",
       x = "Body Mass Index")+
  theme_classic(base_size = 18) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

ggplot(analysisB_data, aes(sample = BMI)) +
  geom_qq(col = "slateblue") + 
  geom_qq_line(col = "magenta") + 
  theme(aspect.ratio = 1) + 
  labs(title = "Normal Q-Q: NHANES BMI")+
  theme_classic(base_size = 18) +
  theme(legend.position = "none",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

#Boxplot
ggplot(analysisB_data, aes(x = Gender, y = BMI, fill = Gender)) +
  geom_boxplot() + coord_flip()+
  labs(title = "Comparison of BMI by Gender",
       x = "Gender",
       y = "Body Mass Index (BMI)") +
theme_classic(base_size = 18) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

#density plot
ggplot(analysisB_data, aes(x = BMI, fill = Gender)) +
  geom_density(alpha = 0.5) +
  labs(title = "BMI Distribution by Gender",
       x = "Body Mass Index (BMI)",
       y = "Density") +
 theme_classic(base_size = 18) +
  theme(legend.position = "left",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18))

5.3. Main Analysis

From the histogram and the Q-Q Normal plot, we observe that Body Mass index is right skewed and hence does not satisfy the normality requirement and hence in the subsequent analyses, I will use non parametric methods such as Wilcoxon rank-sum test and Bootstrap for calculating the confidence intervals.

5.3.1. Wilcoxon Rank-Sum Test

set.seed(112724)
#perform Wilcoxon Rank-sum test
wilcox_test = wilcox.test(BMI ~ Gender, data = analysisB_data)
glance(wilcox_test)%>% kable(digits = 3)%>% kable_styling()
statistic p.value method alternative
6984618 0 Wilcoxon rank sum test with continuity correction two.sided

5.3.2. Using Bootstrap Confidence Interval

I created function to use to compute the mean difference using the bootstrap method

#Define a bootstrap function
bootstrap_mean_diff <- function(data, indices){
  sample_data <- data[indices,]
  mean(sample_data$BMI[sample_data$Gender=="1"])- mean(sample_data$BMI[sample_data$Gender=="2"])
  
}

Running the bootstrap to compute the 90% confidence intervals for the independent sample mean difference.

# the bootstrap
set.seed(2024)
boot_result <- boot(data = analysisB_data, statistic = bootstrap_mean_diff, R =1000)
boot_ci = boot.ci(boot_result, type = "perc", conf = 0.90)
print(boot_ci)
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates

CALL : 
boot.ci(boot.out = boot_result, conf = 0.9, type = "perc")

Intervals : 
Level     Percentile     
90%   (-1.555, -1.014 )  
Calculations and Intervals on Original Scale

5.3.3. Visualize the Results

I used the Boxplot/Violin plot to display the mean difference for better comparison

  ggplot(analysisB_data, aes(x = Gender, y = BMI, fill = Gender)) +
  geom_violin(trim = FALSE, alpha = 0.4, color = "coral", lwd = 0.5) + 
  geom_boxplot(width = 0.4, outlier.shape = NA, alpha = 0.7, color = "gray20") + 
  stat_summary(fun = mean, geom = "point", shape = 23, size = 4, color = "black", fill = "yellow") +
  stat_summary(fun = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = 0.3, color = "black")+   
    coord_flip() +
  labs(title = "Comparison of BMI by Gender",
       caption = "Data Source: NHANES 2017-March 2020",
       x = "Gender", 
       y = "Body Mass Index (BMI)") +
  theme_classic(base_size = 18) +
  theme(legend.position = "none",
        axis.text = element_text(face = "bold",size = 18),
        axis.title.y = element_text(face = "bold", size = 18),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 20),
        plot.subtitle = element_text(hjust = 0.5,size = 18)) +
  scale_fill_brewer(palette = "Paired")

5.4. Conclusions

The mean BMI for males was 29.571(sd = 6.631), and the mean BMI for females was 30.858 (sd = 8.493). The Wilcoxon rank-sum test indicated that there is a significant difference between mean BM1 between males and females (p <0.05). Bootstrap resampling provided a confidence interval of [-1.555, -1.014], since the confidence bounds lie below 0 and are both negative indicate that the male group has a significantly lower mean than the female group.

6. Analysis D

6.1. Question

6.1.1. Problem Statement

Smoking and obesity are two significant risk factors for numerous chronic diseases, including cardiovascular diseases, diabetes, and certain cancers. Investigating the relationship between these two factors is essential to understand whether smoking status (current smoker vs. non-smoker) is associated with obesity status (obese vs. non-obese). By dividing the population into four distinct groups based on these binary categorical variables, we aim to assess the interaction between smoking and obesity and evaluate any potential public health implications.

6.1.2. Research Question

Is there a significant association between smoking status and obesity status in the US population, as represented by the NHANES 2017-March 2020 data?

6.1.3. Hypotheses

6.1.3.1 Null Hypothesis \(({H_o})\)

There is no association between smoking status (current smoker vs. non-smoker) and obesity status (obese vs. non-obese) in the population. The observed frequencies in the 2x2 table are consistent with the assumption of independence.

6.1.3.2. Alternative hypothesis \(({H_A})\)

There is a significant association between smoking status (current smoker vs. non-smoker) and obesity status (obese vs. non-obese) in the population. The observed frequencies in the 2x2 table deviate significantly from what is expected under the assumption of independence.

6.2. Describing The Data

Refer to the codebook in section 3 for the description of the variable SMQ020 and Body Mass Index (BMXBMI). However, for this analysis BMI was converted into a binary variable with people having BMI > 30 considered to be obese and those with BMI < 30 considered non-obese and as earlier descibed smoking was binarized into non-smoker and smoker. Here, I ran a data exploratory analysis and described the distribution of each of the two variables by looking at their statistical summaries, 2x2 table and continued to provide some visualization graphics including bar plot, mosaic plot, and boxplots to provide a better understanding of the data.

6.2.1. Data Summaries

analysisD_data |>
tabyl(Smoking,Obesity) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns(position = "front")%>% kable()%>%kable_styling()
Smoking Obese Non_Obese
Smoker 1,442 (44.8%) 1,775 (55.2%)
Non_Smoker 1,948 (43.1%) 2,567 (56.9%)
Total 3,390 (43.8%) 4,342 (56.2%)

6.2.2. Data Visualization

6.2.2.1. Bar plot
analysisD_data <- readRDS("data/analysisD.rds")
ggplot(analysisD_data, aes(x = Smoking, fill = Obesity))+
  geom_bar(position = "dodge")+
  labs(
    title = "Smoking Status vs. Obesity",
    x = "Smoking Status",
    y = "Count",
    fill = "Obesity"
  )+
  theme_classic(base_size = 16) +
  theme(legend.position = "right",
        axis.title.y = element_text(face = "bold", size = 18),
        axis.text = element_text(face ="bold", size = 16),
        axis.title.x = element_text(face = "bold", size = 18),
        plot.title = element_text(face = "bold", hjust = 0.5, size =))

The analysis examined the association between smoking status (smoker vs. non-smoker) and obesity (obese vs. non-obese) using NHANES 2017-March 2020 data. The contingency table revealed a total of3217 smokers and 4515 non-smokers, 3390 obese people and 4342 non-obese people.

6.2.2.2. Mosaicplot from Contingency Table
contigency_table <- table(analysisD_data$Smoking,analysisD_data$Obesity)
mosaicplot(
  contigency_table,
  main ="Mosaic Plot of Smoking and Obesity", color = TRUE
)

Now, it’s helpful to show our 2 x 2 table. We see there were 1,442 (44.8%) of obese smokers among 3217 smokers and 1948 ( 43.1%) obese people among 4515 non-smokers.

6.3. Main Analysis

6.3.1. Chi-square test

#contigency_table <- analysisD_data %>% tabyl(Smoking, Obesity)
chiSq_test <-chisq.test(contigency_table)
glance(chiSq_test) %>% kable(digits = 3) %>% kable_styling()
statistic p.value parameter method
2.084 0.149 1 Pearson's Chi-squared test with Yates' continuity correction

6.3.2. Compute relative risk

Finally, we can obtain our estimate of relative risk, by pushing the relevant data into the Epi package’s twoby2 function, using a 90% confidence interval, as requested.

twoby2(analysisD_data$Smoking, analysisD_data$Obesity, conf.level = 0.90)
2 by 2 table analysis: 
------------------------------------------------------ 
Outcome   : Obese 
Comparing : Smoker vs. Non_Smoker 

           Obese Non_Obese    P(Obese) 90% conf. interval
Smoker      1442      1775      0.4482    0.4339   0.4627
Non_Smoker  1948      2567      0.4315    0.4194   0.4436

                                   90% conf. interval
             Relative Risk: 1.0389    0.9955   1.0843
         Sample Odds Ratio: 1.0705    0.9918   1.1556
Conditional MLE Odds Ratio: 1.0705    0.9907   1.1568
    Probability difference: 0.0168   -0.0020   0.0356

             Exact P-value: 0.1431 
        Asymptotic P-value: 0.1424 
------------------------------------------------------

The risk of being Obese for smokers relative to non-smokers is 1.0389 (90% CI:0.9955,1.0843), which implies that smokers are 3.89% more likely to be obese compared to non-smokers and since the 90% confidence intervals includes 1, this result is not statistically significant.

The Odds of being obese for smokers relative to non-smokers is 1.0705 (90% CI: 0.9918,1.1556). This implies that smokers have 7.05% higher odds of being obese compared to non-smokers. The confidence interval includes 1, indicating no statistical significance. The conditional maximum likelihood estimate odds ratio (MLE Odds Ratio) is 1.0705 (90%CI:0.9907,1.1568). This is consistent with the sample Odds Ratio, confirming that there is no statistically significant association.

Additionally, the absolute difference in probability of being obese between smokers and non-smokers is 0.0168(90%CI: -0.002,0.0356), which highlights that smokers have a 1.68% higher probability of being obese compared to non-smokers. Further highlighting that there is no statistically significant association between smoking status and obesity as is also t indicated by exact p-value (p =0.1431) and asymptotic p-value (p=0.1424) being greater than 0.05 and the chi-square test (p=0.149).

6.4. Conclusions.

From the statistical analysis test conducted under this test analysis, we did not observe a statistically significant association between smoking status and obesity. However, this association may be influenced by other confounding factors such as lifestyle, physical activity, diet and other socio-economic factors.

7. Analysis E

7.1. Question

7.1.1. Research Question

Is there a statistically significant relationship between race/ethnicity and education level in the NHANES population using a statistical analysis of a contingency table?

7.1.2. Problem Statement

Understanding the relationship between race/ethnicity and education level is critical for addressing systemic disparities in access to educational opportunities. Education level is a key determinant of socioeconomic status, health outcomes, and overall quality of life. Similarly, race and ethnicity often intersect with structural barriers that shape access to education. Examining this relationship can provide insights into equity gaps and inform policy interventions.

7.1.3. Hypothesis

7.1.3.1. Null Hypothesis \(({H_0})\):

There is no association between race/ethnicity and education level in the population represented by the 2017-March 2020 NHANES population data. The observed frequencies in the J x K table are consistent with what would be expected if the variables were independent.

7.1.3.2. Alternative Hypothesis \(({H_A})\):

There is an association between race/ethnicity and education level in the population. The observed frequencies in the J x K table deviate significantly from what would be expected under the assumptions of independence.

7.2. Describing The Data

The data used for this analysis has been described in the codebook in section 3. However, this analysis unravels the association between two categorical variables of race/ethnicity and education level in the NHANES 2017-March 2020 population data set. Race/ethnicity is a categorical variable with 5 levels including Mexican, Hispanic, White, Black and American re-coded from the 6 levels originally present in the RIDRETH3 variable whereas Education level is also a categorical variable with 5 levels too, which have been re-coded to less_9th grade, 9_12th_grade, High_school, Some_college, and College_graduate, describing the 5 of the 6 levels of the original ‘DMDEDUC2’ variable in the 2017-March 2020 NHANES dataset.

7.2.1. Data summary

analysisE_data <-readRDS('data/analysisE.rds')
summary(analysisE_data)
       Race                 Education   
 Mexican :1008   less_9th_grade  : 628  
 Hispanic: 889   9_12th_grade    : 881  
 White   :2693   High_school     :1905  
 Black   :2337   Some_College    :2566  
 Asian   :1074   College_graduate:2021  

From the summary statistics above we observed that there were more non-Hispanic white people than any other race in the dataset and the least represented group were other Hispanic. Notably, there were more people with a Some college degree followed by college graduates and there fewer people having a below 9th grade education.

analysisE_data |>
tabyl(Race,Education) |>
adorn_totals("row") |>
adorn_percentages("row") |>
adorn_pct_formatting() |>
adorn_ns(position = "front")%>% kable()%>%kable_styling()
Race less_9th_grade 9_12th_grade High_school Some_College College_graduate
Mexican 266 (26.4%) 170 (16.9%) 230 (22.8%) 250 (24.8%) 92 (9.1%)
Hispanic 182 (20.5%) 133 (15.0%) 182 (20.5%) 245 (27.6%) 147 (16.5%)
White 51 (1.9%) 219 (8.1%) 679 (25.2%) 1,007 (37.4%) 737 (27.4%)
Black 52 (2.2%) 289 (12.4%) 672 (28.8%) 875 (37.4%) 449 (19.2%)
Asian 77 (7.2%) 70 (6.5%) 142 (13.2%) 189 (17.6%) 596 (55.5%)
Total 628 (7.8%) 881 (11.0%) 1,905 (23.8%) 2,566 (32.1%) 2,021 (25.3%)

It was observed that Mexican American had the least education with at least 26.4% having 9th grade or below whereas Non-Hispanic Asians were the most educated with at least 55.5% of all Non-Hispanic Asian being a college graduate. This can be clearly visualized on the bar plot and the mosaic plots below.

7.2.2. Visualization

7.2.2.1 Bar plot

I used the Bar graph to show the distribution of of education level frequency/count across the different races.

ggplot(analysisE_data, aes(x = Race, fill = Education))+
  geom_bar(position = "dodge")+
  labs(
    title = "Education level by Race/Ethnicity",
    x = "Race",
    y = "Count",
    fill = "Education level"
  )+
  theme_classic(base_size = 16) +
  theme(legend.position = "left",
        axis.title.y = element_text(face = "bold", size = 16),
        axis.text = element_text(face ="bold", size = 16),
        axis.title.x = element_text(face = "bold", size = 16),
        plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
        plot.subtitle = element_text(hjust = 0.5,size = 16))

7.2.2.2 Mosaic plot
# Create a mosaic plot with `vcd::mosaic`
mosaic(
  ~ Race + Education, 
  data = as.data.frame(analysisE_data),
  shade = TRUE,
  labeling_args = list(
    gp_labels = gpar(fontsize = 20),    # Increase cell label size
    gp_varnames = gpar(fontsize = 20), # Increase axis labels size
    gp_main = gpar(fontsize = 20)      # Increase title size
  ),
  main = "Race vs Education Level"
)

7.3. Main Analysis

Basically, I applied the Chi-Square Test of independence to assess whether there is a significant association between race/ethnicity and education level in the population using the data as described

7.3.1. Chi-Square T test of Independence

analysisE_contigency_table <- analysisE_data %>% tabyl(Race, Education)
cell_counts <- as.data.frame(analysisE_contigency_table)
analysisE_chisq <- chisq.test(analysisE_contigency_table)
analysisE_chisq %>% glance() %>% kable()%>%kable_styling()
statistic p.value parameter method
1702.674 0 16 Pearson's Chi-squared test
residuals <- analysisE_chisq$stdres
print(residuals)
     Race less_9th_grade 9_12th_grade High_school Some_College College_graduate
  Mexican     23.4109832     6.350916  -0.7910144    -5.289147       -12.608953
 Hispanic     14.8438507     3.990123  -2.4778125    -3.057076        -6.349636
    White    -14.1079856    -5.859656   2.1002162     7.265012         3.090912
    Black    -12.0149837     2.487403   6.6712304     6.610652        -7.995896
    Asian     -0.8899613    -5.055874  -8.7556165   -10.921523        24.508110

7.3.2. Convert residuals to a Heatmap

residuals_df <- as.data.frame(residuals)
colnames(residuals_df) <- c("Race", "Education", "Residual")

ggplot(residuals_df, aes(x = Education, y = Race, fill = Residual)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  labs(title = "Standardized Residuals Heatmap", x = "Education Level", y = "Race") +
  theme_minimal()

From the Chi-square residual heatmap, we observe that the Non-Hispanic population is overrepresented in attaining higher education as indicated by large positive residual (red), also we see that the Non-Hispanic Black population is overrepresented at lower education category while being underrepresented in the higher education category. Non-Hispanic Asians are under shown to be less represented at lower education levels suggesting they are less likely to fall into the lower education category whereas Mexican American and Other Hispanic groups vary, but with significant deviations in some education levels highlighting over-representation or underrepresentation.

7.4. Conclusions

These findings suggest that there is a significant relationship between race/ethnicity and education levels, which supports for further exploration of the systemic or cultural factors driving these disparities.

8. Session Information

sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] lubridate_1.9.3     stringr_1.5.1       dplyr_1.1.4        
 [4] purrr_1.0.2         readr_2.1.5         tidyr_1.3.1        
 [7] tibble_3.2.1        tidyverse_2.0.0     epitools_0.5-10.1  
[10] xfun_0.47           vcd_1.4-12          Epi_2.53           
[13] broom.mixed_0.2.9.6 gridExtra_2.3       patchwork_1.2.0    
[16] naniar_1.1.0        janitor_2.2.0       kableExtra_1.4.0   
[19] ggpubr_0.6.0        ggplot2_3.5.1       boot_1.3-30        
[22] forcats_1.0.0       nhanesA_1.1        

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1    viridisLite_0.4.2   farver_2.1.2       
 [4] fastmap_1.2.0       digest_0.6.37       timechange_0.3.0   
 [7] lifecycle_1.0.4     survival_3.6-4      magrittr_2.0.3     
[10] compiler_4.4.1      rlang_1.1.4         tools_4.4.1        
[13] utf8_1.2.4          yaml_2.3.10         data.table_1.16.0  
[16] knitr_1.48          ggsignif_0.6.4      labeling_0.4.3     
[19] htmlwidgets_1.6.4   curl_6.0.1          RColorBrewer_1.1-3 
[22] plyr_1.8.9          xml2_1.3.6          abind_1.4-5        
[25] withr_3.0.1         foreign_0.8-86      numDeriv_2016.8-1.1
[28] fansi_1.0.6         colorspace_2.1-1    future_1.34.0      
[31] globals_0.16.3      scales_1.3.0        MASS_7.3-60.2      
[34] cli_3.6.3           etm_1.1.1           rmarkdown_2.28     
[37] ragg_1.3.2          generics_0.1.3      rstudioapi_0.16.0  
[40] tzdb_0.4.0          httr_1.4.7          splines_4.4.1      
[43] rvest_1.0.4         parallel_4.4.1      selectr_0.4-2      
[46] vctrs_0.6.5         Matrix_1.7-0        jsonlite_1.8.8     
[49] carData_3.0-5       car_3.1-2           hms_1.1.3          
[52] rstatix_0.7.2       visdat_0.6.0        listenv_0.9.1      
[55] systemfonts_1.1.0   cmprsk_2.2-12       glue_1.7.0         
[58] parallelly_1.38.0   codetools_0.2-20    stringi_1.8.4      
[61] gtable_0.3.5        lmtest_0.9-40       munsell_0.5.1      
[64] furrr_0.3.1         pillar_1.9.0        htmltools_0.5.8.1  
[67] R6_2.5.1            textshaping_0.4.0   evaluate_0.24.0    
[70] lattice_0.22-6      highr_0.11          backports_1.5.0    
[73] broom_1.0.6         snakecase_0.11.1    Rcpp_1.0.13        
[76] svglite_2.1.3       nlme_3.1-164        mgcv_1.9-1         
[79] zoo_1.8-12          pkgconfig_2.0.3    

© 2024 Bosco Bakwatanisa CC BY-SA 4.0